
# coding: utf-8

# In[74]:


import hashlib
import json
import random
import re
import time
import urllib
from urllib import parse, request
# An earlier attempt built a paginated URL from user input, e.g.
#     page = int(input())
#     url = "http://sz.58.com/chuzu/pn" + str(page) + "/?ClickID=1"
# but paginated scraping failed: the data fetched from the address suggested
# in the assignment did not match what the page shows. A fixed listing URL is
# used instead.
url = (
    "http://sz.58.com/chuzu/"
    "?PGTID=0d100000-0000-4d83-ec50-6954d9295361"
    "&ClickID=2"
)


# In[75]:


# Download the listing page and decode it as UTF-8 text.
# The response is opened in a `with` block so the underlying connection is
# closed as soon as the body has been read instead of being left open.
with urllib.request.urlopen(url) as res:
    html = res.read().decode("utf-8")
# Quick sanity check: show how many characters were downloaded.
print(len(html))


# In[76]:


# Scrape every piece of listing information except the image.
# The pattern has three capture groups: the whitespace-trimmed text of the
# "listclick" link, then the two parts of the <p class="room"> paragraph that
# sit on either side of a run of four "&nbsp;" entities.
# Raw strings keep "\s" as a regex escape instead of an invalid string escape;
# the resulting pattern text is unchanged.
pat = (
    r'<a href.*?tongji_label="listclick".*?target="_blank"  rel="nofollow" >'
    r'\s+(.*?)\s+</a>'
    r'.*?<p class="room">(.*?)\s+&nbsp;&nbsp;&nbsp;&nbsp;(.*?)</p>'
)
# re.S lets ".*?" run across line breaks; findall returns a list of 3-tuples.
dist = re.findall(pat, html, re.S)


# In[79]:


# Images are scraped in a separate pass because matching them together with
# the text fields in a single pattern was very slow.
# The single capture group is the lazy_src attribute value inside each
# <div class="img_list"> block. A raw string keeps "\s" as a regex escape
# instead of an invalid string escape; the pattern text is unchanged.
pat = r'<div class="img_list">.*?lazy_src="(.*?)"\s+src=".*?ng">'
# re.S lets ".*?" run across line breaks; findall returns a list of strings.
dist1 = re.findall(pat, html, re.S)


# In[80]:


# Join the two scrapes: each tuple of text fields in `dist` becomes a list
# with the image URL found at the same position in `dist1` placed at index 3,
# i.e. directly after the three captured text fields.
# NOTE(review): the pairing is purely positional; it assumes both patterns
# match the listings in the same order with none skipped -- confirm against
# the page markup.
for i, fields in enumerate(dist):
    row = list(fields)
    # Index by i so every row gets its own image rather than one shared URL;
    # fall back to None when fewer images than listings were matched.
    image_url = dist1[i] if i < len(dist1) else None
    row.insert(3, image_url)
    dist[i] = row


# In[81]:


# Print every merged listing record, one per line.
for record in dist:
    print(record)

